PANDAS ARE AWESOME


In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

Load Data from standard formats


In [2]:
df_full=pd.read_json('data.json',orient='index')
#Also CSV excel sheets etc.
print df_full.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 7734 entries, 0 to 999
Data columns (total 33 columns):
amp           7734 non-null int64
ch. 1         7734 non-null float64
ch. 10        7734 non-null float64
ch. 2         7734 non-null float64
ch. 3         7734 non-null float64
ch. 4         7734 non-null float64
ch. 5         7734 non-null float64
ch. 6         7734 non-null float64
ch. 7         7734 non-null float64
ch. 8         7734 non-null float64
ch. 9         7734 non-null float64
ch. max       7734 non-null float64
ch. mean      7734 non-null float64
ch. median    7734 non-null float64
ch. min       7734 non-null float64
d             7734 non-null float64
d_min         7734 non-null float64
duplicate     7734 non-null int64
error         6457 non-null float64
error_csd     7734 non-null float64
error_ele     7734 non-null float64
fICA_corr     6457 non-null float64
id            7734 non-null int64
r             7734 non-null float64
x             7734 non-null float64
x_csd         7734 non-null float64
x_est         6457 non-null float64
y             7734 non-null float64
y_csd         7734 non-null float64
y_est         6457 non-null float64
z             7734 non-null float64
z_csd         7734 non-null float64
z_est         6457 non-null float64
dtypes: float64(30), int64(3)
memory usage: 2.0 MB
None

In [3]:
df_full.describe()


Out[3]:
amp ch. 1 ch. 10 ch. 2 ch. 3 ch. 4 ch. 5 ch. 6 ch. 7 ch. 8 ... r x x_csd x_est y y_csd y_est z z_csd z_est
count 7734.0 7734.000000 7734.000000 7734.000000 7734.000000 7734.000000 7734.000000 7734.000000 7734.000000 7734.000000 ... 7734.000000 7734.000000 7734.000000 6457.000000 7734.000000 7734.000000 6457.000000 7734.000000 7734.000000 6457.000000
mean 1.0 0.473248 0.404817 0.408951 0.408225 0.409812 0.407284 0.469891 0.407935 0.405564 ... 1.031835 0.013495 0.020160 -0.002145 0.020345 0.019834 0.003525 0.045718 0.034310 0.047667
std 0.0 0.262362 0.238164 0.241954 0.240242 0.242271 0.239867 0.260771 0.241474 0.239006 ... 0.562420 1.674770 1.751224 1.872882 1.666181 1.732581 1.858646 1.645288 1.547203 1.782591
min 1.0 0.015638 0.013003 0.013632 0.011738 0.011845 0.012360 0.016353 0.012846 0.012116 ... 0.050085 -3.951708 -6.000000 -6.000000 -3.917189 -6.000000 -6.000000 -3.858361 -4.153846 -4.769231
25% 1.0 0.250141 0.208879 0.203517 0.209576 0.205477 0.209485 0.244741 0.205046 0.206085 ... 0.538562 -1.169688 -0.549153 -0.711864 -1.151324 -0.589831 -0.671186 -1.114794 -0.830769 -0.769231
50% 1.0 0.473839 0.384987 0.389206 0.390848 0.388492 0.387474 0.472783 0.386912 0.386980 ... 1.037853 0.001709 0.020339 0.020339 0.011867 0.020339 -0.020339 0.047713 0.276923 0.092308
75% 1.0 0.666819 0.579079 0.587956 0.586692 0.592580 0.583135 0.662097 0.587212 0.588271 ... 1.522593 1.199614 0.589831 0.711864 1.212545 0.589831 0.752542 1.215751 0.953846 0.953846
max 1.0 0.999723 0.995511 0.997420 0.995614 0.993956 0.995160 0.997164 0.996309 0.997828 ... 1.999741 3.879548 6.000000 6.000000 3.936638 6.000000 5.959322 3.932886 4.215385 4.707692

8 rows × 33 columns

Select columns


In [4]:
df=df_full[['d','r','x','y','z','error','error_csd','d_min','id','duplicate','fICA_corr']]

Get Quick statistics


In [5]:
df.describe()


Out[5]:
d r x y z error error_csd d_min id duplicate fICA_corr
count 7734.000000 7734.000000 7734.000000 7734.000000 7734.000000 6457.000000 7734.000000 7734.000000 7734.000000 7734.000000 6457.000000
mean 2.688305 1.031835 0.013495 0.020345 0.045718 1.575929 1.309659 1.638455 3732.299198 1.064779 0.906329
std 1.030901 0.562420 1.674770 1.666181 1.645288 1.182998 0.800133 0.723597 2168.291717 1.391563 0.234030
min 0.000509 0.050085 -3.951708 -3.917189 -3.858361 0.037339 0.043389 0.061151 0.000000 0.000000 -0.841047
25% 2.013362 0.538562 -1.169688 -1.151324 -1.114794 0.787596 0.704441 1.062434 1854.250000 1.000000 0.937136
50% 2.950472 1.037853 0.001709 0.011867 0.047713 1.269466 1.176192 1.648668 3716.500000 1.000000 0.961879
75% 3.545258 1.522593 1.199614 1.212545 1.215751 1.977636 1.719683 2.170632 5608.750000 1.000000 0.975996
max 3.999791 1.999741 3.879548 3.936638 3.932886 11.073811 5.044192 3.511029 7499.000000 10.000000 0.998263

In [6]:
df.head() # first 5 elements head(10) gives 10 etc
#same for tail but not the first but last 5


Out[6]:
d r x y z error error_csd d_min id duplicate fICA_corr
0 3.879864 0.758191 1.716371 -3.472934 0.214813 1.837613 1.638082 2.279648 5240 1 0.910044
1 3.967862 1.448772 -0.793211 3.510844 -1.669945 2.149533 1.794049 2.068963 4955 1 0.968209
10 3.062933 1.988753 -2.019346 1.124542 2.009777 5.094245 0.549863 1.882657 6485 5 -0.003062
100 3.930673 1.470187 -1.461876 0.740678 -3.572745 1.939445 1.760780 3.206235 6693 1 0.851529
1000 2.071114 0.350897 0.303692 -1.204260 1.657421 NaN 1.218462 1.437030 5911 0 NaN

And other relevant info


In [7]:
df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 7734 entries, 0 to 999
Data columns (total 11 columns):
d            7734 non-null float64
r            7734 non-null float64
x            7734 non-null float64
y            7734 non-null float64
z            7734 non-null float64
error        6457 non-null float64
error_csd    7734 non-null float64
d_min        7734 non-null float64
id           7734 non-null int64
duplicate    7734 non-null int64
fICA_corr    6457 non-null float64
dtypes: float64(9), int64(2)
memory usage: 725.1 KB

cleaning up data


In [8]:
df=df.dropna() #get rid of NaNs
df.dropna(inplace=True) #get rid of NaNs
#df=df.fillna(0) # fill NaNs with any value

In [9]:
df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 6457 entries, 0 to 999
Data columns (total 11 columns):
d            6457 non-null float64
r            6457 non-null float64
x            6457 non-null float64
y            6457 non-null float64
z            6457 non-null float64
error        6457 non-null float64
error_csd    6457 non-null float64
d_min        6457 non-null float64
id           6457 non-null int64
duplicate    6457 non-null int64
fICA_corr    6457 non-null float64
dtypes: float64(9), int64(2)
memory usage: 605.3 KB

In [10]:
df.loc[:,'bin_r'],bin_r=pd.cut(df['r'], np.arange(0,2.25,0.25), retbins=True)
df.loc[:,'bin_d'],bin_d=pd.cut(df['d'], np.arange(0,4.25,0.25), retbins=True)
# you could als do df['bin_d']=pd.cut(df['d'], np.arange(0,4.25,0.25))
# but this is not encouraged for some internal mechanisms reason

In [11]:
df.describe()


Out[11]:
d r x y z error error_csd d_min id duplicate fICA_corr
count 6457.000000 6457.000000 6457.000000 6457.000000 6457.000000 6457.000000 6457.000000 6457.000000 6457.000000 6457.000000 6457.000000
mean 2.614072 1.186414 0.013800 0.032420 0.058826 1.575929 1.240348 1.582573 3714.328326 1.275360 0.906329
std 1.046838 0.480229 1.647371 1.633516 1.594801 1.182998 0.778362 0.720614 2170.101003 1.432086 0.234030
min 0.000509 0.100053 -3.951708 -3.917189 -3.832222 0.037339 0.043389 0.061151 0.000000 1.000000 -0.841047
25% 1.933274 0.780910 -1.130986 -1.095630 -1.063142 0.787596 0.667807 1.001902 1831.000000 1.000000 0.937136
50% 2.861361 1.192417 0.004527 0.019658 0.060379 1.269466 1.082931 1.571060 3691.000000 1.000000 0.961879
75% 3.486199 1.600173 1.172744 1.180062 1.174021 1.977636 1.639797 2.108895 5593.000000 1.000000 0.975996
max 3.999465 1.999741 3.857658 3.936638 3.932886 11.073811 4.882667 3.511029 7499.000000 10.000000 0.998263

In [12]:
df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 6457 entries, 0 to 999
Data columns (total 13 columns):
d            6457 non-null float64
r            6457 non-null float64
x            6457 non-null float64
y            6457 non-null float64
z            6457 non-null float64
error        6457 non-null float64
error_csd    6457 non-null float64
d_min        6457 non-null float64
id           6457 non-null int64
duplicate    6457 non-null int64
fICA_corr    6457 non-null float64
bin_r        6457 non-null category
bin_d        6457 non-null category
dtypes: category(2), float64(9), int64(2)
memory usage: 618.3 KB

In [13]:
# Sort by multiple columns
df.sort_values(['bin_r', 'd'], ascending=[True, False])


Object `pd.hist` not found.

In [ ]:
df['bin_d'].hist()

In [15]:
df.rename(columns={'bin_r' : 'r_binned','bin_d' : 'd_binned'}, inplace = True)

In [16]:
#df.info()
df_tabled=pd.crosstab(df['r_binned'],df['d_binned'])
df_tabled


Out[16]:
d_binned (0.0, 0.25] (0.25, 0.5] (0.5, 0.75] (0.75, 1.0] (1.0, 1.25] (1.25, 1.5] (1.5, 1.75] (1.75, 2.0] (2.0, 2.25] (2.25, 2.5] (2.5, 2.75] (2.75, 3.0] (3.0, 3.25] (3.25, 3.5] (3.5, 3.75] (3.75, 4.0]
r_binned
(0.0, 0.25] 7 11 10 3 1 1 4 1 4 4 0 0 0 0 0 0
(0.25, 0.5] 30 18 31 25 23 30 28 35 31 49 45 42 44 24 23 7
(0.5, 0.75] 23 40 21 31 20 31 48 56 64 61 68 82 108 99 111 113
(0.75, 1.0] 13 14 26 35 29 37 36 44 59 55 62 76 112 104 123 123
(1.0, 1.25] 26 23 22 21 35 31 39 48 61 65 80 85 89 113 129 131
(1.25, 1.5] 12 17 28 17 18 34 46 57 52 52 76 94 119 104 122 140
(1.5, 1.75] 13 20 25 28 29 40 37 53 58 53 75 88 96 122 138 138
(1.75, 2.0] 25 19 17 28 29 38 40 51 62 61 65 90 94 105 134 145

In [17]:
#plt.pcolor(df_tabled.values)
sns.heatmap(df_tabled)
plt.ylim((0,8))


Out[17]:
(0, 8)

In [18]:
df.loc[:,'distance']=np.sqrt(np.square(df['x'])+np.square(df['y'])+np.square(df['z']))
df[['d','distance']].describe()


Out[18]:
d distance
count 6457.000000 6457.000000
mean 2.614072 2.614072
std 1.046838 1.046838
min 0.000509 0.000509
25% 1.933274 1.933274
50% 2.861361 2.861361
75% 3.486199 3.486199
max 3.999465 3.999465

In [19]:
def calc_phi(df):
    return np.arctan(df['y']/df['x'])
def calc_theta(df):
    return np.arccos(df['z']/df['d'])

df['phi'] = df.apply(calc_phi,axis=1) # works on row pd.map works on element

df['theta'] = df.apply(calc_theta,axis=1)
df[['theta','phi']].describe()
df['theta']=calc_theta(df)

In [20]:
ds_string = df['phi'].map('this is a string describing phi = {}'.format)
#ds_string

In [21]:
#df.cov()

In [22]:
df.corr()


Out[22]:
d r x y z error error_csd d_min id duplicate fICA_corr distance phi theta
d 1.000000 0.133720 -0.000285 0.016805 0.006853 0.348398 0.616879 0.808210 0.014282 -0.003054 -0.014891 1.000000 -0.002774 0.009477
r 0.133720 1.000000 0.007779 0.015617 0.004336 -0.070145 -0.094045 0.148783 0.014595 -0.028220 0.051425 0.133720 0.012504 0.000342
x -0.000285 0.007779 1.000000 -0.013894 -0.019962 0.011081 0.012282 0.002735 -0.033999 0.063019 -0.042690 -0.000285 0.011474 0.017928
y 0.016805 0.015617 -0.013894 1.000000 -0.019258 0.034793 -0.014079 0.008685 -0.015893 0.089638 -0.068241 0.016805 -0.004711 0.009937
z 0.006853 0.004336 -0.019962 -0.019258 1.000000 0.006350 0.010143 0.030587 0.007834 -0.010160 0.000693 0.006853 -0.003219 -0.923641
error 0.348398 -0.070145 0.011081 0.034793 0.006350 1.000000 0.315292 0.299346 -0.019447 0.405917 -0.514807 0.348398 -0.018494 -0.008324
error_csd 0.616879 -0.094045 0.012282 -0.014079 0.010143 0.315292 1.000000 0.661353 -0.006726 0.046735 -0.053741 0.616879 -0.009884 0.000439
d_min 0.808210 0.148783 0.002735 0.008685 0.030587 0.299346 0.661353 1.000000 0.006008 -0.021887 -0.034805 0.808210 0.000456 -0.016603
id 0.014282 0.014595 -0.033999 -0.015893 0.007834 -0.019447 -0.006726 0.006008 1.000000 -0.062565 0.042841 0.014282 0.024165 -0.006378
duplicate -0.003054 -0.028220 0.063019 0.089638 -0.010160 0.405917 0.046735 -0.021887 -0.062565 1.000000 -0.801746 -0.003054 -0.036562 -0.004559
fICA_corr -0.014891 0.051425 -0.042690 -0.068241 0.000693 -0.514807 -0.053741 -0.034805 0.042841 -0.801746 1.000000 -0.014891 0.028283 0.011986
distance 1.000000 0.133720 -0.000285 0.016805 0.006853 0.348398 0.616879 0.808210 0.014282 -0.003054 -0.014891 1.000000 -0.002774 0.009477
phi -0.002774 0.012504 0.011474 -0.004711 -0.003219 -0.018494 -0.009884 0.000456 0.024165 -0.036562 0.028283 -0.002774 1.000000 0.005772
theta 0.009477 0.000342 0.017928 0.009937 -0.923641 -0.008324 0.000439 -0.016603 -0.006378 -0.004559 0.011986 0.009477 0.005772 1.000000

In [23]:
df.hist(figsize=(16,9))


Out[23]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x7f568c5c0690>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f568c4d0710>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f568c4f9510>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f568c3b9c50>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x7f568c2ede50>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f568b1a3c90>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f568b127e90>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f568b0a7450>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x7f568b02c650>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f568af90c10>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f568af12f10>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f568ae90650>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x7f568ae12a10>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f568aeb6c50>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f568ad10610>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f568ad46bd0>]], dtype=object)

In [24]:
df_full[['r','d','error']].values


Out[24]:
array([[ 0.75819087,  3.87986402,  1.83761268],
       [ 1.44877152,  3.96786212,  2.14953342],
       [ 1.98875272,  3.06293275,  5.09424485],
       ..., 
       [ 0.7815668 ,  3.51740699,  1.43725949],
       [ 1.65716911,  3.20431109,  1.08022274],
       [ 0.37189041,  0.96907892,  0.42407218]])

In [25]:
print df_full['id'].nunique()
print df_full['id'].count()


7500
7734


In [ ]:


In [26]:
grouped_table_2=pd.pivot_table(df,index='r_binned',columns='d_binned',values='error_csd',aggfunc=np.median)
grouped_table_2


Out[26]:
d_binned (0.0, 0.25] (0.25, 0.5] (0.5, 0.75] (0.75, 1.0] (1.0, 1.25] (1.25, 1.5] (1.5, 1.75] (1.75, 2.0] (2.0, 2.25] (2.25, 2.5] (2.5, 2.75] (2.75, 3.0] (3.0, 3.25] (3.25, 3.5] (3.5, 3.75] (3.75, 4.0]
r_binned
(0.0, 0.25] 0.385211 0.194883 0.514749 0.353590 0.123352 0.770140 0.965110 0.638443 0.633961 0.912416 NaN NaN NaN NaN NaN NaN
(0.25, 0.5] 0.411896 0.476062 0.509952 0.548505 0.450938 0.799059 0.702002 0.691470 1.008340 0.836751 0.897178 1.215698 1.246646 1.317212 1.639797 1.697519
(0.5, 0.75] 0.201903 0.480300 0.519431 0.334592 0.313419 0.514622 0.703414 0.890496 0.798687 0.938890 1.046663 1.221729 1.521117 1.563107 1.734664 1.903729
(0.75, 1.0] 0.190983 0.340893 0.426403 0.498093 0.563822 0.552548 0.665183 0.957262 1.094717 0.895096 1.166072 1.185778 1.417468 1.475576 1.687400 1.947446
(1.0, 1.25] 0.219799 0.335944 0.386041 0.544931 0.606814 0.653061 0.686339 0.808415 1.036159 1.144634 0.922160 1.300359 1.270011 1.614623 1.669495 1.850429
(1.25, 1.5] 0.192484 0.283038 0.367474 0.510043 0.598251 0.680024 0.695958 0.689189 0.835914 1.174245 0.809702 1.293803 1.173598 1.449069 1.491681 1.753323
(1.5, 1.75] 0.167978 0.253615 0.369281 0.495651 0.574360 0.594012 0.711916 0.735753 0.774486 0.825196 0.915336 1.028795 1.156366 1.254526 1.489816 1.592393
(1.75, 2.0] 0.154738 0.252809 0.353848 0.466284 0.535190 0.621890 0.664028 0.697334 0.730166 0.795943 0.999408 0.879684 0.952633 1.152984 1.280880 1.382828

In [27]:
grouped_table=df.groupby(['r_binned','d_binned'])['error'].quantile(0.5).unstack().fillna(5)
grouped_table


Out[27]:
d_binned (0.0, 0.25] (0.25, 0.5] (0.5, 0.75] (0.75, 1.0] (1.0, 1.25] (1.25, 1.5] (1.5, 1.75] (1.75, 2.0] (2.0, 2.25] (2.25, 2.5] (2.5, 2.75] (2.75, 3.0] (3.0, 3.25] (3.25, 3.5] (3.5, 3.75] (3.75, 4.0]
r_binned
(0.0, 0.25] 0.348818 0.515666 0.512044 0.601704 0.550679 0.607051 2.304251 0.332262 1.054505 1.196965 5.000000 5.000000 5.000000 5.000000 5.000000 5.000000
(0.25, 0.5] 0.383769 0.510202 0.557029 0.549368 0.457071 0.648954 0.819869 0.888712 1.537671 1.064853 1.180558 1.773606 2.478910 2.132291 2.318295 1.772829
(0.5, 0.75] 0.560478 0.543448 0.534780 0.486380 0.408188 0.649900 0.894788 1.126206 2.028397 1.366619 1.482923 1.526092 1.807070 1.681350 1.887857 2.120417
(0.75, 1.0] 0.424731 0.457213 0.508265 0.532918 0.642606 0.689923 0.765819 0.895502 1.246072 1.351933 1.408471 1.600421 1.821625 1.550840 1.634189 1.885405
(1.0, 1.25] 0.473409 0.482182 0.701874 0.525087 0.668062 0.713152 0.710952 1.022604 1.199535 1.292614 1.313893 1.314130 1.569950 1.463129 1.700093 1.844464
(1.25, 1.5] 0.605319 0.543735 0.510814 0.523623 0.673911 1.078833 0.825453 1.016311 1.156912 1.369499 1.419654 1.460203 1.427639 1.438949 1.545851 1.796441
(1.5, 1.75] 0.778568 0.789567 0.454605 0.493477 0.705928 0.750550 0.714354 1.038921 1.063042 1.208572 1.173602 1.424514 1.512793 1.674025 1.679612 1.647186
(1.75, 2.0] 0.620124 0.776067 0.844912 0.813648 0.847595 0.852697 1.001284 1.087279 1.175648 1.347983 1.236558 1.251298 1.222735 1.322535 1.481052 1.448258

In [28]:
df_sub=df[['r_binned','d_binned','error','error_csd']]

melted_table=pd.melt(df_sub, id_vars=["r_binned",'d_binned'], var_name="error type", value_name="error value")
melted_table=melted_table.sort_values(by=['r_binned','d_binned'])
melted_table


Out[28]:
r_binned d_binned error type error value
1845 (0.0, 0.25] (0.0, 0.25] error 0.453886
2403 (0.0, 0.25] (0.0, 0.25] error 0.103426
2757 (0.0, 0.25] (0.0, 0.25] error 0.228098
3460 (0.0, 0.25] (0.0, 0.25] error 0.361611
4051 (0.0, 0.25] (0.0, 0.25] error 0.348818
5360 (0.0, 0.25] (0.0, 0.25] error 0.647600
5965 (0.0, 0.25] (0.0, 0.25] error 0.110844
8302 (0.0, 0.25] (0.0, 0.25] error_csd 0.447243
8860 (0.0, 0.25] (0.0, 0.25] error_csd 0.212210
9214 (0.0, 0.25] (0.0, 0.25] error_csd 0.211820
9917 (0.0, 0.25] (0.0, 0.25] error_csd 0.644209
10508 (0.0, 0.25] (0.0, 0.25] error_csd 0.385211
11817 (0.0, 0.25] (0.0, 0.25] error_csd 0.085178
12422 (0.0, 0.25] (0.0, 0.25] error_csd 0.385845
1243 (0.0, 0.25] (0.25, 0.5] error 0.231588
1527 (0.0, 0.25] (0.25, 0.5] error 0.527470
1703 (0.0, 0.25] (0.25, 0.5] error 0.634052
2155 (0.0, 0.25] (0.25, 0.5] error 0.631949
2331 (0.0, 0.25] (0.25, 0.5] error 0.494558
5381 (0.0, 0.25] (0.25, 0.5] error 0.428364
5908 (0.0, 0.25] (0.25, 0.5] error 0.679245
6023 (0.0, 0.25] (0.25, 0.5] error 0.174270
6078 (0.0, 0.25] (0.25, 0.5] error 0.515666
6259 (0.0, 0.25] (0.25, 0.5] error 1.275784
6274 (0.0, 0.25] (0.25, 0.5] error 0.351359
7700 (0.0, 0.25] (0.25, 0.5] error_csd 0.152232
7984 (0.0, 0.25] (0.25, 0.5] error_csd 0.546019
8160 (0.0, 0.25] (0.25, 0.5] error_csd 0.194883
8612 (0.0, 0.25] (0.25, 0.5] error_csd 0.147728
8788 (0.0, 0.25] (0.25, 0.5] error_csd 0.482387
... ... ... ... ...
11806 (1.75, 2.0] (3.75, 4.0] error_csd 2.088336
11824 (1.75, 2.0] (3.75, 4.0] error_csd 1.203810
11875 (1.75, 2.0] (3.75, 4.0] error_csd 2.749728
11898 (1.75, 2.0] (3.75, 4.0] error_csd 1.051744
12003 (1.75, 2.0] (3.75, 4.0] error_csd 1.394073
12011 (1.75, 2.0] (3.75, 4.0] error_csd 1.335654
12069 (1.75, 2.0] (3.75, 4.0] error_csd 0.850741
12077 (1.75, 2.0] (3.75, 4.0] error_csd 1.206773
12081 (1.75, 2.0] (3.75, 4.0] error_csd 0.861405
12110 (1.75, 2.0] (3.75, 4.0] error_csd 1.800268
12126 (1.75, 2.0] (3.75, 4.0] error_csd 1.214933
12127 (1.75, 2.0] (3.75, 4.0] error_csd 1.378747
12137 (1.75, 2.0] (3.75, 4.0] error_csd 1.525103
12182 (1.75, 2.0] (3.75, 4.0] error_csd 1.203179
12212 (1.75, 2.0] (3.75, 4.0] error_csd 0.979543
12257 (1.75, 2.0] (3.75, 4.0] error_csd 1.468328
12260 (1.75, 2.0] (3.75, 4.0] error_csd 2.268446
12329 (1.75, 2.0] (3.75, 4.0] error_csd 1.624673
12342 (1.75, 2.0] (3.75, 4.0] error_csd 1.088875
12371 (1.75, 2.0] (3.75, 4.0] error_csd 1.131006
12510 (1.75, 2.0] (3.75, 4.0] error_csd 1.314477
12511 (1.75, 2.0] (3.75, 4.0] error_csd 1.086530
12563 (1.75, 2.0] (3.75, 4.0] error_csd 1.386746
12574 (1.75, 2.0] (3.75, 4.0] error_csd 1.759977
12681 (1.75, 2.0] (3.75, 4.0] error_csd 1.571346
12714 (1.75, 2.0] (3.75, 4.0] error_csd 2.564571
12715 (1.75, 2.0] (3.75, 4.0] error_csd 1.119898
12725 (1.75, 2.0] (3.75, 4.0] error_csd 0.541611
12840 (1.75, 2.0] (3.75, 4.0] error_csd 0.755731
12883 (1.75, 2.0] (3.75, 4.0] error_csd 2.073395

12914 rows × 4 columns


In [29]:
plt.figure(figsize=(16,9))
sns.boxplot(data=melted_table,x='d_binned',y='error value',hue='error type')


Out[29]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f568a007a10>

In [30]:
df_2D=df.groupby(['r_binned','d_binned'])['error']
for name,group in df_2D:
    #print name,group
    print group.quantile(0.5)-group.mean()


0.0267774749429
-0.0247252997182
-0.03961914364
-0.0388722451667
0.0
0.0
-0.210612506925
0.0
-0.04680898855
-0.3901555904
-0.557486587913
-0.0234893847111
0.0155329232161
-0.0401936671
-0.0209197213304
-0.172856122113
-0.494706707875
-0.582596266283
-0.193150634945
-0.371442810135
-0.554540274749
-0.4239563632
-0.345156226186
-0.441742566658
-0.106694918939
-0.417711360257
-1.30933008073
-0.742735859788
0.000710699090476
-0.00212322674194
-0.010270298565
-0.180145715745
-0.249612225396
-0.614614414105
-0.351785110364
-0.363084703354
-0.224066268235
-0.49482405112
-0.461981372609
-0.378950202832
-0.52298921829
-0.333621995609
-0.0611685649154
-0.0739710767714
-0.172005639465
-0.681862934766
-0.0151044492276
-0.011493484273
-0.145308067183
-0.109898307359
-0.171649970942
-0.167221607391
-0.384292461276
-0.199065329887
-0.32149523378
-0.319105026674
-0.279114070209
-0.156551220816
-0.0472068271346
-0.152405974765
-0.0620058462091
-0.0227965055048
-0.206875774814
-0.0379778457774
-0.0429210144385
-0.0415456185229
-0.00407174488689
-0.120037290914
-0.236117236429
-0.440101608898
-0.365242680773
-0.256445354732
-0.326871645647
-0.328669791221
-0.0817537217667
-0.119118512753
-0.161997097889
-0.0455909217235
-0.0683194189722
-0.500309224241
-0.144811723022
0.0246587163509
-0.09008694175
0.0845392126538
-0.354698858051
-0.180486625487
-0.358015010913
-0.191364567317
-0.148853792084
-0.0851452133
-0.111589054538
0.023439621885
-0.226850143756
-0.113639411929
-0.109184702686
-0.24264902374
-0.0884708710486
-0.10160069947
-0.240696373829
-0.281771783958
-0.321340160719
-0.105055365632
-0.290252356801
-0.401590432893
-0.102197076554
-0.182304972563
-0.178092187444
-0.0118016132053
-0.183598360935
-0.0297424070286
0.0232481820448
-0.262639933482
-0.296618347485
-0.505403610269
-0.195522741356
-0.472708524836
-0.199443161768
-0.102174759582
-0.339148542459
-0.162337600133
-0.0878332018925
-0.267197442277

In [ ]:


In [ ]:


In [ ]:


In [31]:
df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 6457 entries, 0 to 999
Data columns (total 16 columns):
d            6457 non-null float64
r            6457 non-null float64
x            6457 non-null float64
y            6457 non-null float64
z            6457 non-null float64
error        6457 non-null float64
error_csd    6457 non-null float64
d_min        6457 non-null float64
id           6457 non-null int64
duplicate    6457 non-null int64
fICA_corr    6457 non-null float64
r_binned     6457 non-null category
d_binned     6457 non-null category
distance     6457 non-null float64
phi          6457 non-null float64
theta        6457 non-null float64
dtypes: category(2), float64(12), int64(2)
memory usage: 1.1 MB

In [32]:
df_2D=(
    df.groupby(['r_binned','d_binned'])['error']
      ).quantile(0.5
      ).unstack(fill_value=2.50
      ).where(df_tabled>7,2.5)

sns.heatmap(df_2D,cmap='Greys',vmin=0,vmax=2.5)
plt.ylim([0,8])


Out[32]:
(0, 8)

In [33]:
df_missed=df_full.loc[df_full['duplicate']==0]
g = (sns.jointplot(x="r", y="d",
                   data=df_missed, color="k",kind='hex'))
g.ax_joint.set(xlim=(0, 1.05),ylim=(0,4.1))


Out[33]:
[(0, 4.1), (0, 1.05)]

In [34]:
g = (sns.jointplot(x="r", y="d",
                   data=df_missed, color="k",kind='kde'))
g.ax_joint.set(xlim=(0, 1.05),ylim=(0,4.1))


Out[34]:
[(0, 4.1), (0, 1.05)]

In [35]:
g = sns.PairGrid(data=df,vars=['phi','theta','error'])
g = g.map_upper(plt.scatter)
g = g.map_lower(sns.kdeplot, cmap="Blues_d")
#g = g.map_diag(sns.kdeplot, lw=3, legend=False)
g = g.map_diag(plt.hist)
g


Out[35]:
<seaborn.axisgrid.PairGrid at 0x7f5686c3a510>

double entries?


In [ ]:


In [38]:
plt.figure(figsize=(16,9))
sns.boxplot(data=df,x='r_binned',y='error')


Out[38]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f56871cff10>

In [39]:
plt.figure(figsize=(16,9))

sns.boxplot(data=df,x='r_binned',y='error',hue='d_binned')


Out[39]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f56871cffd0>

In [40]:
plt.figure(figsize=(16,9))
sns.swarmplot(data=df,x='r_binned',y='error')


Out[40]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f56848aac50>

In [41]:
sns.violinplot(data=df,x='r_binned',y='error')


Out[41]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f56848aa250>

In [42]:
sns.violinplot(data=melted_table[:],x='r_binned',y='error value',hue='error type',split=True)


Out[42]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f567f76ad10>

In [ ]:


In [ ]:


In [ ]:


In [ ]: